In [ ]:
import logging
import requests
from pprint import pprint
from requests import RequestException
from os import path
from bs4 import BeautifulSoup

from html2corpus import HTML2Corpus
from html2corpus.extractors import ReadabilityExtractor, ParagraphExtractor

def check(link, blackwords):
    # True only if none of the blacklisted substrings occur in the link.
    return all(blackword not in link for blackword in blackwords)
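
For instance, a quick hypothetical sanity check of the helper (illustrative URLs only):


In [ ]:
print(check('http://example.org/artikel/1', ['impressum', 'datenschutz']))  # True
print(check('http://example.org/impressum', ['impressum', 'datenschutz']))  # False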

Crawling German political party sites

Die Linke

News from the website of the party: http://www.die-linke.de/nc/die-linke/nachrichten


In [ ]:
domain = 'http://www.die-linke.de'
keyword = 'artikel'

site = 'http://www.die-linke.de/nc/die-linke/nachrichten'
pages = ['{}/browse/{}'.format(site, i) for i in range(1, 99)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke.txt'))
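
The cells below repeat this fetch-parse-yield pattern, with only the domain, link keyword, blackwords, and pagination scheme changing. As a sketch (not used by the cells that follow), the pattern could be factored into one reusable generator:


In [ ]:
def crawl(pages, domain, keyword, blackwords=()):
    # Generic form of get_data() above: fetch each listing page, keep the
    # article links containing `keyword` (and none of the `blackwords`),
    # and yield the raw HTML of every article.
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set(link['href'] for link in soup.findAll('a')
                        if link.get('href') and keyword in link['href']
                        and check(link['href'], blackwords))
            for href in links:
                yield requests.get('{}/{}'.format(domain, href), timeout=10).content
        except RequestException as error:
            logging.error('Error: %s', error)

# Usage would then be, e.g.:
# HTML2Corpus(crawl(pages, domain, 'artikel'),
#             extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke.txt'))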

Press releases from the website of the party: http://www.die-linke.de/nc/presse/presseerklaerungen/presseerklaerungen


In [ ]:
domain = 'http://www.die-linke.de'
keyword = 'artikel'
site = 'http://www.die-linke.de/nc/presse/presseerklaerungen/presseerklaerungen'
pages = ['{}/browse/{}'.format(site, i) for i in range(1, 272)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke_PR.txt'))

Press releases from the faction: http://www.linksfraktion.de/pressemitteilungen


In [ ]:
domain = 'http://www.linksfraktion.de'
keyword = 'pressemitteilungen'
site = 'http://www.linksfraktion.de/pressemitteilungen'
pages = ['{}/?s={}'.format(site, i) for i in range(1, 1384)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke_Fraktion.txt'))

SPD

Press releases from the website of the faction: http://www.spdfraktion.de/presse/pressemitteilungen


In [ ]:
domain = 'http://www.spdfraktion.de'
keyword = 'presse/pressemitteilungen/'
blackword = 'feed'
site = 'http://www.spdfraktion.de/presse/pressemitteilungen'
pages = ['{}?page={}'.format(site, i) for i in range(1, 733)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href'] and blackword not in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_SPD_Fraktion.txt'))

Press releases from the website of the EU faction: https://www.spd-europa.de/pressemitteilung


In [ ]:
domain = 'https://www.spd-europa.de'
keyword = '/pressemitteilungen/'
site = 'https://www.spd-europa.de/pressemitteilung'
pages = ['{}?page={}'.format(site, i) for i in range(1, 165)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'SPD_EU.txt'))

Vorwärts, an SPD newspaper.


In [ ]:
domain = 'http://www.vorwaerts.de'
keyword = '/artikel/'
blackwords = ['#comment-form']
site = 'http://www.vorwaerts.de/international'
pages = ['{}?page={}'.format(site, i) for i in range(1, 124)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = [link['href'] for link in soup.findAll('a')
                        if link.get('href', None) and keyword in link['href'] and check(link['href'], blackwords)]
            links = map(lambda x: '{}{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'SPD_Vorwärts.txt'))

Grüne

Press releases from the website of the faction: http://www.gruene-bundestag.de/presse_ID_2000127


In [ ]:
domain = 'http://www.gruene-bundestag.de'
keyword = 'presse/pressemitteilungen/'
blackword = 'feed'
site = 'http://www.gruene-bundestag.de/presse_ID_2000127'
pages = ['{}/pb_id/100/seite/{}'.format(site, i) for i in range(2, 1322)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href'] and blackword not in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_Grüne_Fraktion.txt'))

FDP

Press releases from the website of the party: http://www.fdp.de/pressemitteilungen


In [ ]:
domain = 'http://www.fdp.de'
keyword = '/content/'
blackwords = set(['datenschutz', 'impressum'])
site = 'http://www.fdp.de/pressemitteilungen'
pages = ['{}?page={}'.format(site, i) for i in range(1, 97)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'Corpus_FDP.txt'))

In [ ]:
domain = 'http://www.liberale.de'
keyword = '/content/'
blackwords = set(['datenschutz', 'impressum'])
site = 'http://www.liberale.de/page/pressemitteilungen'
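# %2C is a URL-encoded comma, so these listing URLs read as ?page=0,1 up to ?page=0,1062.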
pages = ['{}?page=0%2C{}'.format(site, i) for i in range(1, 1063)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'Corpus_FDP_Fraktion.txt'))

CDU

CDU/CSU faction


In [ ]:
domain = 'http://www.presseportal.de'
keyword = '/pm/7846/'
site = 'http://www.presseportal.de/nr/7846'
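# Note: presseportal paginates by result offset in the URL path rather than a
# ?page= parameter; with the step of 27 used here the listing pages are
# .../nr/7846/27, .../nr/7846/54, and so on.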
pages = ['{}/{}'.format(site, i * 27) for i in range(1, 621)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'CDU_Fraktion.txt'))

In [ ]:
domain = 'http://www.presseportal.de'
keyword = '/pm/6518/'
site = 'http://www.presseportal.de/nr/6518'
pages = ['{}/{}'.format(site, i * 27) for i in range(1, 38)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'CDU.txt'))

In [ ]:
domain = 'http://www.cdu-csu-ep.de'
keyword = '/presse/pressemitteilungen/'
blackwords = set(['content'])
site = 'http://www.cdu-csu-ep.de/pressearchiv.html'
pages = ['{}?start={}'.format(site, i * 5) for i in range(0, 643)]

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href', None) and keyword in link['href'] and check(link['href'], blackwords)])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'CDU_EU.txt'))

NPD


In [ ]:
keyword = '/?p='
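# '/?p=' matches WordPress-style post permalinks; the matched hrefs appear to be
# absolute URLs here, so no domain prefix is prepended below.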
site = 'http://aktion-widerstand.de/?page_id=11042'
pages = ['{}&paged={}'.format(site, i) for i in range(2, 335)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href']])
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'Corpus_NPD_Jung.txt'))

In [ ]:
domain = 'http://www.npd-fraktion-mv.de'
keyword = '&view=article&'
blackwords = set(['content'])
site = 'http://www.npd-fraktion-mv.de/index.php?com=news&view=archive'
pages = ['{}&b={}&mid=8'.format(site, i * 50) for i in range(0, 38)]

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            links = set([link['href'] for link in soup.findAll('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)])
            links = map(lambda x: '{}/{}'.format(domain, x), links)
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_NPD_MV.txt'))

In [ ]:
domain = 'http://www.npd-fraktion-sachsen.de'
blackwords = set(['meldungen', 'category', 'author'])
site = 'http://www.npd-fraktion-sachsen.de/category/meldungen'
pages = ['{}/page/{}'.format(site, i) for i in range(2, 194)]

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content)
            blog = soup.find('div', id='blog-left')
            if blog is None:
                continue
            links = set([link['href'] for link in blog.findAll('a')
                        if link.get('href') and check(link['href'], blackwords)])
            for article in list(links):
                article_req = requests.get(article)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
            
HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_NPD_Sachsen.txt'))